{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d628efe4-7470-4076-a65c-9c0e1a86ac8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pymysql\n",
    "from pymysql import Connection\n",
    "from pymysql.cursors import DictCursor\n",
    "def get_connection(autocommit: bool = True) -> Connection:\n",
    "    db_conf = {\n",
    "            \"host\": \"192.168.98.55\",\n",
    "            \"port\": 4000,\n",
    "            \"user\": \"dataware_house_testUser\",\n",
    "            \"password\": \"IlGiUL2qcdqckoIzj6c4\",\n",
    "            \"database\": \"dataware_house_test\",\n",
    "            \"autocommit\": autocommit,\n",
    "            # \"cursorclass\": DictCursor,\n",
    "        }\n",
    "    conn:Connection = pymysql.connect(**db_conf)\n",
    "    return conn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "46d9770b-5182-4730-b692-beb2190371b6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_24004\\726147944.py:4: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
      "  df = pd.read_sql(sql, conn)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>keyid</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>pub_year</th>\n",
       "      <th>vol</th>\n",
       "      <th>num</th>\n",
       "      <th>begin_page</th>\n",
       "      <th>end_page</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000013237A9FC89FAE20C0198431AE5454FDF</td>\n",
       "      <td>桂林南药股份有限公司薪酬分配优化方案</td>\n",
       "      <td>曾晴[1]</td>\n",
       "      <td>2019</td>\n",
       "      <td></td>\n",
       "      <td>23</td>\n",
       "      <td>121</td>\n",
       "      <td>122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000012D3A7C0E9062D7E27C0025AA36874260</td>\n",
       "      <td>互联网交互设计发展趋势及现状分析</td>\n",
       "      <td>赵丽</td>\n",
       "      <td>2015</td>\n",
       "      <td></td>\n",
       "      <td>14</td>\n",
       "      <td>70</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000012DE03C4587E99F724429E5F364520ED0</td>\n",
       "      <td>四川省生猪产业国内竞争力探析</td>\n",
       "      <td>肖亮[1];黄智良[2];李泉河[1]</td>\n",
       "      <td>2021</td>\n",
       "      <td></td>\n",
       "      <td>31</td>\n",
       "      <td>127</td>\n",
       "      <td>129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>000012DF2E4ACEDFF41201182E3870F259CAF</td>\n",
       "      <td>区块链综述和创新应用场景探究</td>\n",
       "      <td>汤佳霖[1]</td>\n",
       "      <td>2019</td>\n",
       "      <td></td>\n",
       "      <td>22</td>\n",
       "      <td>144</td>\n",
       "      <td>145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>000012E74A8F5D708D86A7E16BA4B0A625D18</td>\n",
       "      <td>区块链在电子商务中的应用分析</td>\n",
       "      <td>刘扬波[1]</td>\n",
       "      <td>2021</td>\n",
       "      <td></td>\n",
       "      <td>15</td>\n",
       "      <td>16</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15750</th>\n",
       "      <td>00001DFFA1CE0F3DF2F84AA0A5893B6B15E4F</td>\n",
       "      <td>试析企业并购整合过程中的问题和对策</td>\n",
       "      <td>郭红霞</td>\n",
       "      <td>2015</td>\n",
       "      <td></td>\n",
       "      <td>17</td>\n",
       "      <td>49</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15751</th>\n",
       "      <td>00001E03CF075C489BAC578AA0847D9B5B277</td>\n",
       "      <td>论企业融资成本对资本结构调整的影响</td>\n",
       "      <td>蔡菊娥[1]</td>\n",
       "      <td>2019</td>\n",
       "      <td></td>\n",
       "      <td>16</td>\n",
       "      <td>70</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15752</th>\n",
       "      <td>00001E044B44B79D778073DF6620DEB4C1B2D</td>\n",
       "      <td>公共巨灾保险研究——基于海南民生保障视角及宁波公共巨灾保险经验</td>\n",
       "      <td>郑晓玲[1]</td>\n",
       "      <td>2019</td>\n",
       "      <td></td>\n",
       "      <td>22</td>\n",
       "      <td>152</td>\n",
       "      <td>154</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15753</th>\n",
       "      <td>00001E15B42A9D0E532FFF6251506FBCCD4D6</td>\n",
       "      <td>企业现金流量管理水平提升策略</td>\n",
       "      <td>朱庆义[1]</td>\n",
       "      <td>2021</td>\n",
       "      <td></td>\n",
       "      <td>31</td>\n",
       "      <td>61</td>\n",
       "      <td>63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15754</th>\n",
       "      <td>00001E16369EA3D2DE00CE47A2F7ADAC4A0F9</td>\n",
       "      <td>组织行为学在人力资源管理中的应用</td>\n",
       "      <td>陈磊[1]</td>\n",
       "      <td>2021</td>\n",
       "      <td></td>\n",
       "      <td>15</td>\n",
       "      <td>128</td>\n",
       "      <td>130</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>15755 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       keyid                            title  \\\n",
       "0      000013237A9FC89FAE20C0198431AE5454FDF               桂林南药股份有限公司薪酬分配优化方案   \n",
       "1      000012D3A7C0E9062D7E27C0025AA36874260                 互联网交互设计发展趋势及现状分析   \n",
       "2      000012DE03C4587E99F724429E5F364520ED0                   四川省生猪产业国内竞争力探析   \n",
       "3      000012DF2E4ACEDFF41201182E3870F259CAF                   区块链综述和创新应用场景探究   \n",
       "4      000012E74A8F5D708D86A7E16BA4B0A625D18                   区块链在电子商务中的应用分析   \n",
       "...                                      ...                              ...   \n",
       "15750  00001DFFA1CE0F3DF2F84AA0A5893B6B15E4F                试析企业并购整合过程中的问题和对策   \n",
       "15751  00001E03CF075C489BAC578AA0847D9B5B277                论企业融资成本对资本结构调整的影响   \n",
       "15752  00001E044B44B79D778073DF6620DEB4C1B2D  公共巨灾保险研究——基于海南民生保障视角及宁波公共巨灾保险经验   \n",
       "15753  00001E15B42A9D0E532FFF6251506FBCCD4D6                   企业现金流量管理水平提升策略   \n",
       "15754  00001E16369EA3D2DE00CE47A2F7ADAC4A0F9                 组织行为学在人力资源管理中的应用   \n",
       "\n",
       "                    author pub_year vol num begin_page end_page  \n",
       "0                    曾晴[1]     2019      23        121      122  \n",
       "1                       赵丽     2015      14         70           \n",
       "2      肖亮[1];黄智良[2];李泉河[1]     2021      31        127      129  \n",
       "3                   汤佳霖[1]     2019      22        144      145  \n",
       "4                   刘扬波[1]     2021      15         16       18  \n",
       "...                    ...      ...  ..  ..        ...      ...  \n",
       "15750                  郭红霞     2015      17         49       50  \n",
       "15751               蔡菊娥[1]     2019      16         70       72  \n",
       "15752               郑晓玲[1]     2019      22        152      154  \n",
       "15753               朱庆义[1]     2021      31         61       63  \n",
       "15754                陈磊[1]     2021      15        128      130  \n",
       "\n",
       "[15755 rows x 8 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "sql = \"select keyid,title,author,pub_year,vol,num,begin_page,end_page from dataware_house_test.base_fact_title_journal WHERE gch = '88422A'\"\n",
    "with get_connection(autocommit=False) as conn:\n",
    "    df = pd.read_sql(sql, conn)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5bce330c-b91a-4dce-a166-0a0c1f1e1b12",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'keyid': '000013237A9FC89FAE20C0198431AE5454FDF', 'title': '桂林南药股份有限公司薪酬分配优化方案', 'author': '曾晴[1]', 'pub_year': '2019', 'vol': '', 'num': '23', 'begin_page': '121', 'end_page': '122'}, {'keyid': '000012D3A7C0E9062D7E27C0025AA36874260', 'title': '互联网交互设计发展趋势及现状分析', 'author': '赵丽', 'pub_year': '2015', 'vol': '', 'num': '14', 'begin_page': '70', 'end_page': ''}, {'keyid': '000012DE03C4587E99F724429E5F364520ED0', 'title': '四川省生猪产业国内竞争力探析', 'author': '肖亮[1];黄智良[2];李泉河[1]', 'pub_year': '2021', 'vol': '', 'num': '31', 'begin_page': '127', 'end_page': '129'}, {'keyid': '000012DF2E4ACEDFF41201182E3870F259CAF', 'title': '区块链综述和创新应用场景探究', 'author': '汤佳霖[1]', 'pub_year': '2019', 'vol': '', 'num': '22', 'begin_page': '144', 'end_page': '145'}, {'keyid': '000012E74A8F5D708D86A7E16BA4B0A625D18', 'title': '区块链在电子商务中的应用分析', 'author': '刘扬波[1]', 'pub_year': '2021', 'vol': '', 'num': '15', 'begin_page': '16', 'end_page': '18'}, {'keyid': '0000133684A3C6BF717DFCB55D4B73DD1CDEE', 'title': '2015年3月份各式皮包市场月度分析报告', 'author': '', 'pub_year': '2015', 'vol': '', 'num': '18', 'begin_page': '15', 'end_page': '25'}, {'keyid': '0000133A34A8B1B8774C54AE8814C1E229727', 'title': '论注册制下的监管制度建设', 'author': '陈文萍[1]', 'pub_year': '2019', 'vol': '', 'num': '16', 'begin_page': '172', 'end_page': '173'}, {'keyid': '0000134C7277412BDAB1BB1487A629EBE6963', 'title': '平台是否滥用了市场支配地位?——以南京市生鲜平台为例', 'author': '田春晖[1]', 'pub_year': '2021', 'vol': '', 'num': '31', 'begin_page': '3', 'end_page': '8'}, {'keyid': '0000134DB2E3C59EBE77B666B8A61C3E2DF9B', 'title': '场景时代下社交电商平台运营模式对比分析——以拼多多、云集为例', 'author': '曾俊[1];洪乐[1];邹鹏羽[1]', 'pub_year': '2020', 'vol': '', 'num': '18', 'begin_page': '7', 'end_page': '8'}, {'keyid': '00001J1W6HD88JPW6PC1Q', 'title': '非同一控制下企业合并报表编制研究', 'author': '谢华英[1]', 'pub_year': '2017', 'vol': '', 'num': '30', 'begin_page': '95', 'end_page': '96'}]\n"
     ]
    }
   ],
   "source": [
    "result = df.to_dict(orient='records')\n",
    "print(result[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9f5a7f51-ae44-4035-97e2-44a49c753c62",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "29.52308201789856\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████| 15755/15755 [00:00<00:00, 23237.46it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "44\n",
      "0.6800005435943604\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import jieba\n",
    "from datasketch import MinHash, MinHashLSH\n",
    "import time\n",
    "\n",
    "# 示例数据\n",
    "documents = result\n",
    "\n",
    "# 初始化jieba分词器\n",
    "jieba.initialize()\n",
    "\n",
    "# 创建MinHash和LSH对象\n",
    "# threshold Jaccard 距离阈值设定，默认为0.9\n",
    "lsh = MinHashLSH(threshold=0.7, num_perm=128)\n",
    "minhashes = {}\n",
    "start_time = time.time()\n",
    "# 处理文本，计算MinHash\n",
    "for i, doc in enumerate(documents):\n",
    "    words = jieba.cut(doc[\"title\"])\n",
    "    m = MinHash(num_perm=128)\n",
    "    for word in words:\n",
    "        m.update(word.encode('utf8'))\n",
    "    minhashes[doc[\"keyid\"]] = m\n",
    "    lsh.insert(doc[\"keyid\"], m)\n",
    "end1 = time.time()\n",
    "print(end1 - start_time)\n",
    "# 查找重复文档\n",
    "from tqdm import tqdm\n",
    "max_num = 0\n",
    "result_max = []\n",
    "k = \"\"\n",
    "for keyid, minhash in tqdm(minhashes.items()):\n",
    "    result1 = lsh.query(minhash)\n",
    "    if len(result1) > max_num:\n",
    "        max_num = len(result1)\n",
    "        result_max = result1\n",
    "        k = keyid\n",
    "end2 = time.time()\n",
    "print(max_num)\n",
    "print(end2 - end1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "bb948eb6-39cf-47a1-a139-f8d14d2ffb2a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['00001JV16PBO1JH1MJD00',\n",
       " '00001D05EFD3598857ABC1D43F293D2D3484D',\n",
       " '00001J1W6DC82J5XMJCPQ',\n",
       " '00001JV16PD84JD06NC00',\n",
       " '00001JV1MPD86J5WMLD80',\n",
       " '00001JV16PBO2J90M9D89',\n",
       " '00001JV1MPD89JLVMPC05',\n",
       " '00001JV16PBO3J1X6PDG0',\n",
       " '00001JV16PD84JDVMNCG1',\n",
       " '00001J1W6DDO2JDVMND1Q',\n",
       " '00001JV16PDO7J11M9BO9',\n",
       " '00001JV16PC05J1X6DBO8',\n",
       " '00001J1W6DD01J916DBPQ',\n",
       " '00001JV16PDO2JVXMBCO1',\n",
       " '00001JV16PDG6JLWMFD00',\n",
       " '00001JV16PC83JPWMJCO6',\n",
       " '00001JV16PDO7J1169BG5',\n",
       " '00001J1W6DC82J506LD9Q',\n",
       " '00001J1W6HDG8IL1M7D9Q',\n",
       " '00001JV16PCO0ILWMHBO6',\n",
       " '00001JV16PC07J5V6LCO4',\n",
       " '00001JV1MPD87J5WMNDO8',\n",
       " '00001JV1MPD89JLVMLCO7',\n",
       " '00001JV16PBO2J90M9D83',\n",
       " '00001JV16PDG6JLWMDD88',\n",
       " '00001JV16PBO4JD06BC04',\n",
       " '00001JV16PC03JHW6ND88',\n",
       " '00001JV1MPDO4IPWMHD88',\n",
       " '00001JV16PDG4J9WMLD08',\n",
       " '00001J1W6LCO7ILV67BPQ',\n",
       " '00001JV16PD82JDWMPC08',\n",
       " '00001JV1MPDG8JVWMPDG8',\n",
       " '00001JV16PD84J9X6PD07',\n",
       " '00001JV16PD84JD0MFCG2',\n",
       " '00001JV16PDG6JLWMFBG8',\n",
       " '00001F05016DB94B701F28FF5FA9FDA3FC937',\n",
       " '00001F13025CAFF4F284007602D7753C079B5',\n",
       " '00001J1W6LCG2JLVMFBPQ',\n",
       " '00001JV16PBG1J9W6JDO8',\n",
       " '00001JV16PDG6JLWMLCG0',\n",
       " '00001JV16PDG6JLWMLCG2',\n",
       " '00001JV1MPD86J5WMLCO8',\n",
       " '00001J1W6JCO2JV1MDC9Q',\n",
       " '00001JV1MPD87J5WMPBG5']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result_max"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e38353c6-c759-4f34-9347-9fe399a56d97",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
